{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Implementing CLSM - Keras"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Purpose\n",
    "The purpose of this notebook is to implement Microsoft's [Convolutional Latent Semantic Model](http://www.iro.umontreal.ca/~lisa/pointeurs/ir0895-he-2.pdf) in Keras, and evaluate it on our dataset.\n",
    "\n",
    "## Inputs\n",
    "- This notebook requires *wiki-pages* from the FEVER dataset as an input."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:11.417322Z",
     "start_time": "2018-11-09T16:01:10.510008Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import nltk\n",
    "import utils\n",
    "import pickle\n",
    "\n",
    "from scipy import sparse\n",
    "from joblib import Parallel, delayed\n",
    "from multiprocessing import cpu_count\n",
    "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n",
    "from tqdm import tqdm_notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:13.761700Z",
     "start_time": "2018-11-09T16:01:11.419683Z"
    }
   },
   "outputs": [],
   "source": [
    "claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data(\"../train.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:13.775775Z",
     "start_time": "2018-11-09T16:01:13.770916Z"
    }
   },
   "outputs": [],
   "source": [
    "def generate_all_tokens(arr):\n",
    "    all_tokens = []\n",
    "    for unprocessed_claim in tqdm_notebook(arr):\n",
    "        c = utils.preprocess_article_name(unprocessed_claim)\n",
    "        c = \"! {} !\".format(c)\n",
    "        for word in c.split():\n",
    "            letter_tuples = list(nltk.ngrams(\"#\" + word + \"#\", 3))\n",
    "            letter_grams = []\n",
    "            for l in letter_tuples:\n",
    "                letter_grams.append(\"\".join(l))\n",
    "            all_tokens.extend(letter_grams)\n",
    "    return all_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:23.686677Z",
     "start_time": "2018-11-09T16:01:13.777086Z"
    }
   },
   "outputs": [],
   "source": [
    "processed_claims = generate_all_tokens(claims)\n",
    "processed_claims.extend(generate_all_tokens(article_list))\n",
    "\n",
    "possible_tokens = list(set(processed_claims))\n",
    "\n",
    "encoder = LabelEncoder()\n",
    "encoder.fit(np.array(sorted(possible_tokens)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:23.729508Z",
     "start_time": "2018-11-09T16:01:23.690608Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5d2c05aac8dd438c91753264c590ee04",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "feature_encoder = {}\n",
    "for idx, e in tqdm_notebook(enumerate(encoder.classes_)):\n",
    "    feature_encoder[e] = idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:23.736325Z",
     "start_time": "2018-11-09T16:01:23.731564Z"
    }
   },
   "outputs": [],
   "source": [
    "def tokenize_claim(c, enc):\n",
    "    \"\"\"\n",
    "    Input: a string that represents a single claim\n",
    "    Output: a list of 3x|vocabulary| arrays that has a 1 where the letter-gram exists.\n",
    "    \"\"\"\n",
    "    encoded_vector = []\n",
    "    c = utils.preprocess_article_name(c)\n",
    "    c = \"! {} !\".format(c)\n",
    "    for ngram in nltk.ngrams(nltk.word_tokenize(c), 3):\n",
    "        arr = sparse.lil_matrix((3, len(enc.__dict__['classes_'])))\n",
    "        for idx, word in enumerate(ngram):\n",
    "            for letter_gram in nltk.ngrams(\"#\" + word + \"#\", 3):\n",
    "                s = \"\".join(letter_gram)\n",
    "                letter_idx = feature_encoder[s]\n",
    "                arr[idx, letter_idx] = 1\n",
    "        encoded_vector.append(arr)\n",
    "    return encoded_vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:23.756777Z",
     "start_time": "2018-11-09T16:01:23.737822Z"
    }
   },
   "outputs": [],
   "source": [
    "load_processed_claims = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:39:49.933116Z",
     "start_time": "2018-11-09T14:39:47.833416Z"
    }
   },
   "outputs": [],
   "source": [
    "from tensorflow.python.client import device_lib\n",
    "print(device_lib.list_local_devices())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:01:23.772933Z",
     "start_time": "2018-11-09T16:01:23.758448Z"
    }
   },
   "outputs": [],
   "source": [
    "load_processed_claims = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T16:12:32.621485Z",
     "start_time": "2018-11-09T16:12:32.606040Z"
    }
   },
   "outputs": [],
   "source": [
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2018-11-09T16:12:33.170Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "487e3882e53744b4af3809382b377195",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=89997), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fe5885a6fd62471781362fe06d84d1a5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "if load_processed_claims:\n",
    "    with open(\"all_data.pkl\", \"rb\") as f:\n",
    "        all_data = pickle.load(f)\n",
    "else:\n",
    "    all_data = []\n",
    "\n",
    "    article_set = set(article_list)\n",
    "    \n",
    "    def process_claim(idx):\n",
    "        J = 50\n",
    "        data = {}\n",
    "        data['claim'] = tokenize_claim(claims[idx], encoder)\n",
    "        data['positive_article'] = tokenize_claim(article_list[idx], encoder)\n",
    "        negative_articles = np.random.choice(list(article_set - set(claim_to_article[claims[idx]])), J)\n",
    "        negative_articles = [tokenize_claim(i, encoder) for i in negative_articles]\n",
    "        for i in range(J):\n",
    "            data['negative_article_{}'.format(i)] = negative_articles[i]\n",
    "        return data\n",
    "\n",
    "    all_data = utils.parallel_process(range(90000), process_claim, n_jobs=6)\n",
    "    \n",
    "    joblib.dump(all_data, \"all_data_2.pkl\")\n",
    "    #with open(\"all_data_2.pkl\", \"wb\") as f:\n",
    "    #    pickle.dump(all_data, f)\n",
    "    #all_data = Parallel(n_jobs=cpu_count(), verbose=1, prefer=\"threads\")(delayed(process_claim)(i) for i in range(len(claims)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Beginning the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:46:19.702634Z",
     "start_time": "2018-11-09T14:46:15.657882Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%run deep_semantic_similarity_keras.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:41:20.745309Z",
     "start_time": "2018-11-09T14:41:20.647466Z"
    }
   },
   "outputs": [],
   "source": [
    "from scipy import sparse\n",
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "from keras.utils import multi_gpu_model\n",
    "import keras\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:41:20.749209Z",
     "start_time": "2018-11-09T14:41:20.746849Z"
    }
   },
   "outputs": [],
   "source": [
    "load_processed_claims = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:41:26.958590Z",
     "start_time": "2018-11-09T14:41:20.753706Z"
    }
   },
   "outputs": [],
   "source": [
    "if load_processed_claims:\n",
    "    with open(\"saved_data.pkl\", \"rb\") as f:\n",
    "        data = pickle.load(f)\n",
    "else:\n",
    "    data = {\"claim\":[], \"positive_article\":[], \"negative_article_0\":[], \"negative_article_1\":[], \\\n",
    "            \"negative_article_2\":[], \"negative_article_3\":[]}\n",
    "\n",
    "    for d in tqdm_notebook(all_data):\n",
    "        data['claim'].append(sparse.vstack(d['claim']))\n",
    "        data['positive_article'].append(sparse.vstack(d['positive_article']))\n",
    "        data['negative_article_0'].append(sparse.vstack(d['negative_article_0']))\n",
    "        data['negative_article_1'].append(sparse.vstack(d['negative_article_1']))\n",
    "        data['negative_article_2'].append(sparse.vstack(d['negative_article_2']))\n",
    "        data['negative_article_3'].append(sparse.vstack(d['negative_article_3']))\n",
    "\n",
    "    with open(\"saved_data.pkl\", \"wb\") as f:\n",
    "        pickle.dump(data, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we work on training the model in a batchsize manner."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:41:26.997708Z",
     "start_time": "2018-11-09T14:41:26.994252Z"
    }
   },
   "outputs": [],
   "source": [
    "len(data['claim'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:41:27.018303Z",
     "start_time": "2018-11-09T14:41:26.999127Z"
    }
   },
   "outputs": [],
   "source": [
    "y = np.zeros((1, J+1))\n",
    "y[:,0] = 1\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:46:39.942593Z",
     "start_time": "2018-11-09T14:46:39.935743Z"
    }
   },
   "outputs": [],
   "source": [
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:46:19.713063Z",
     "start_time": "2018-11-09T14:46:19.705207Z"
    }
   },
   "outputs": [],
   "source": [
    "class DataGenerator(keras.utils.Sequence):\n",
    "    \"\"\"\n",
    "    Generates data with batch size of 1 sample for the purposes of training our model.\n",
    "    \"\"\"\n",
    "    def __init__(self, data, J, batch_size=32, split=None):\n",
    "        \"\"\"\n",
    "            Sets the initial arguments and creates\n",
    "            an indicies array to randomize the dataset\n",
    "            between epochs\n",
    "        \"\"\"\n",
    "        if split:            \n",
    "            self.indicies = split\n",
    "        else:\n",
    "            self.indicies = list(range(len(data)))\n",
    "        self.data = data\n",
    "        self.J = J\n",
    "        self.batch_size = batch_size\n",
    "        \n",
    "    def __len__(self):\n",
    "        return int(np.floor(len(self.indicies) / self.batch_size))\n",
    "    \n",
    "    def __getitem__(self, index):\n",
    "        return self.get_item(index)\n",
    "    \n",
    "    def get_item(self, index):            \n",
    "            \n",
    "        final = {}\n",
    "        #idx = self.indicies[index*self.batch_size:(index+1)*self.batch_size]  # help randomly shuffle the dataset\n",
    "        idx = self.indicies[index]\n",
    "        for k in self.data.keys():\n",
    "            final[k] = np.expand_dims(self.data[k][idx].todense(),0)\n",
    "            #print(\"Stacking array {}\".format(k))\n",
    "            \n",
    "#             arrays = np.array(arrays)\n",
    "#             lens = np.array([len(i) for i in arrays])\n",
    "\n",
    "#             # Mask of valid places in each row\n",
    "#             mask = np.arange(lens.max()) < lens[:,None]\n",
    "\n",
    "#             # Setup output array and put elements from data into masked positions\n",
    "#             out = np.zeros(mask.shape, dtype=arrays.dtype)\n",
    "#             out[mask] = np.vstack(arrays)\n",
    "        \n",
    "            #final[k] = np.array(arrays)\n",
    "            \n",
    "        y = np.zeros((self.batch_size, self.J+1))\n",
    "        y[:,0] = 1\n",
    "\n",
    "        return final, y\n",
    "    \n",
    "    def on_epoch_end(self):\n",
    "        #np.random.shuffle(self.indicies)\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:46:19.726448Z",
     "start_time": "2018-11-09T14:46:19.715105Z"
    }
   },
   "outputs": [],
   "source": [
    "generator = DataGenerator(data, 4, batch_size=1, split=range(0, 90000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:46:20.014908Z",
     "start_time": "2018-11-09T14:46:19.968257Z"
    }
   },
   "outputs": [],
   "source": [
    "model.compile(loss=\"categorical_crossentropy\", optimizer=\"adadelta\", metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-09T14:46:31.106727Z",
     "start_time": "2018-11-09T14:46:20.688309Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "model.fit_generator(generator=generator, epochs=20, workers=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "1+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "parallel_model = multi_gpu_model(model, gpus=2)\n",
    "parallel_model.compile(loss='categorical_crossentropy',\n",
    "                       optimizer='adadelta')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reset_weights(model):\n",
    "    session = backend.get_session()\n",
    "    for layer in model.layers: \n",
    "        if hasattr(layer, 'kernel_initializer'):\n",
    "            layer.kernel.initializer.run(session=session)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reset_weights(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "validation = DataGenerator(data, J, split=range(90000, 125000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.evaluate_generator(generator=validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in tqdm_notebook(range(len(data['claim']))):\n",
    "    batch = {\"claim\":[], \"positive_article\":[], \"negative_article_0\":[], \"negative_article_1\":[], \\\n",
    "        \"negative_article_2\":[], \"negative_article_3\":[]}\n",
    "    batch['claim'] = np.expand_dims(data['claim'][i].todense(), 0)\n",
    "    batch['positive_article'] = np.expand_dims(data['positive_article'][i].todense(), 0)\n",
    "    batch['negative_article_0'] = np.expand_dims(data['negative_article_0'][i].todense(), 0)\n",
    "    batch['negative_article_1'] = np.expand_dims(data['negative_article_1'][i].todense(), 0)\n",
    "    batch['negative_article_2'] = np.expand_dims(data['negative_article_2'][i].todense(), 0)\n",
    "    batch['negative_article_3'] = np.expand_dims(data['negative_article_3'][i].todense(), 0)\n",
    "    model.fit(batch, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(data, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "article_set = set(article_list)\n",
    "\n",
    "def process_claim(idx):\n",
    "    data = {}\n",
    "    data['claim'] = tokenize_claim(claims[idx], encoder)\n",
    "    data['positive_article'] = tokenize_claim(article_list[idx], encoder)\n",
    "    negative_articles = np.random.choice(list(article_set - set(claim_to_article[claims[idx]])), J)\n",
    "    negative_articles = [tokenize_claim(i, encoder) for i in negative_articles]\n",
    "    data['negative_article'] = negative_articles\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "process_claim(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.argwhere(all_data[0]['claim'][0]==0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
